clear all
set more off
global dir "../../../"


/*Overview 

This do file cleans AIS and Clarkson data, and saves relevant variables as ship_all_variables.dta

-Input Data 
	"aisrawdata.dta" - Primary raw AIS (Automatic Identification System) vessel tracking data
	"dwt_clarkson.dta" and "Clarkson0.csv" - Two Clarkson shipping database files with vessel characteristics
	
-Output Data 
	"ship_all_variables.dta" saved in ".\Repository\data_intermediate" directory. This dataset contains:
		1.Clean ship identifiers (IMO number, ship_id_new)
		2.Ship characteristics merged from both AIS and Clarkson data:
			-Ship name
			-Draught 
			-DWT (deadweight tonnage)
			-TEU (twenty-foot equivalent unit capacity)
		3.Intermediate datasets served as input data for z2, z3 do files 

-The code follows a 3-step cleaning process:
	1.Cleans ship identifiers (handling duplicate IMO numbers, MMSI numbers, and ship IDs)
	2.Cleans AIS data and creates intermediate datasets:
			-port_list_full (port characteristics)
			-ship_list_full (ship characteristics)
			-trip_list_full (trip-level data)
			-ais_container_sorted_yz (travel stamp data)
	3.Merges the cleaned AIS data with Clarkson vessel data, primarily matching on IMO numbers and MMSI numbers

	The code also handles various data irregularities like:
		Missing IMO numbers
		Duplicate vessel identifiers
		Irregular trip patterns (e.g., multiple arrivals/departures)
		Inconsistent ship characteristics across sources
*/


*******************************************************************************************************
*Step 1, clean AIS Ship Identifier
   *Raw data has the cases: one imo number matched with 2 ship id, or/and 2 mmsi
   *                        two imo number matched with 1 ship id
   *                        one mmsi number matched with 2 ship id and 2 mmsi
   *                        IMO missings 
   
   *as IMO is ships unique identifier, find all missings and mistakes manually and adjusted the rest
                            *See explainations in do files for details
********************************************************************************************************

**(1) Get container data, unprocessed

cd "$dir\Get ship_port_to_port.dta\raw Data"
use "aisrawdata.dta", clear
keep if type_name=="Container Ship"|type_name=="Cargo/Containership"
save temp_container,replace


**(2) Get unique identifier for ships
use temp_container,clear
keep ship_id shipname mmsi imo
   *manually fill in missing imos & adjust mistakes find  
	replace imo=9299812 if shipname=="CMA CGM NORMA"
	replace imo=9400198 if shipname=="JPO TUCANA"
	replace imo=9484560 if shipname=="CAP JACKSON" & mmsi==563052600
	replace shipname="BEA SCHULTE" if mmsi==477639500  //mmsi and dwt matches a diff ship
	
	*corresp to manuall check in (31) of this do file
    drop if mmsi==413307000 //shipname jinhaizhifeng, Cargo-Hazard A, no imo info in Marinetime website. Was using general cargo STARISTIND's IMO, no dwt
	drop if mmsi==249000016  //similar prob, shipname Alice, Cargo-Hazard A, no imo, no dwt
	drop if mmsi==416123456 // a fishing ship without name,  no imo, no dwt
	drop if mmsi==413793601 //shipname WangQian Sheng 957, Cargo, no imo, no dwt
	

duplicates drop 
collapse (first)shipname,by(ship_id mmsi imo)
save temp_list,replace

use temp_list,clear
local var "mmsi imo ship_id"
local j=0
foreach i of local var{
local j=`j'+1
   duplicates tag `i' if `i'!=., gen(id`j')
}
sum id*
   *maximum 1 duplicates
save temp,replace



**(2)Save distinct ones
use temp,clear             //5,287 cases
keep if id1==0 & id2==0 &id3==0 // 5036 case where ship_id, imo, mmsi are distinct. Left with 251 irregular case
gen ship_id_new=ship_id
drop id*
rename mmsi mmsi_1
save temp_p1,replace


**(3)Save imo number is the same, but mmsi or ship_id may changed (i.e. one imo matches with 2 mmsi or 2 shipid)
  *Verified in a saparate do file: those cases they are ships with same name and same dtw. So should be same ship.
  *So treat this as same ship, just changed mmsi or/and  ship id.
use temp,clear
drop if id1==0 & id2==0 &id3==0
keep if id2==1 //same imo number, doubt is the same ship. 244 irrgular cases
sort imo shipname
bysort imo: gen check=1 if ship_id==ship_id[_n-1]
egen pair=group(imo)
bysort pair: replace check=1 if check[_n+1]==1
save temp1,replace

        *(31) identified those with same shipid and imo number, but had changes in mmsi number
		use temp1,clear
		keep if check==1 
		sort ship_id imo mmsi
		collapse (first) mmsi shipname,by(ship_id imo)
		rename mmsi mmsi_1
		save temp11,replace

		use temp1,clear
		keep if check==1
		sort ship_id imo mmsi
		collapse (last) mmsi ,by(ship_id imo) 
		rename mmsi mmsi_2
		merge 1:1 imo ship_id using temp11  //114 obs, corr 228 out of 250 irregular cases in temp.dta
		drop _merge
		replace mmsi_2=. if mmsi_1==mmsi_2
		gen ship_id_new=ship_id
		save temp_p2,replace
		
		
		*(31) identified those with diff ship_id (imo number may or maynot be the same)
		use temp1,clear
		drop if check==1  // only 16 obs needs to be fixed
        *br               // checked, all have same shipname. 
		
		*If they also have same mmsi, think as same ship. 
		*If mmsi differs, check on Marinetime website to see what going on. Adj manually in the begining of the code.
		
		sort ship_id mmsi imo
		collapse (first) mmsi ship_id shipname,by( imo)
		save temp11,replace
		
		use temp1,clear
		drop if check==1  
		sort ship_id mmsi imo
		collapse (last) mmsi ship_id,by( imo)
		rename mmsi mmsi_2 
		rename ship_id ship_id_2
		merge 1:1 imo using temp11
		drop _merge
		rename mmsi mmsi_1
		replace mmsi_2=. if mmsi_1==mmsi_2
		replace ship_id=688563 if shipname=="BEA SCHULTE"  //share ship_id 688561 (prb due to mistake) with CAP JACKSON, mannually adj
		replace ship_id_2=688561 if shipname=="BEA SCHULTE"
		save temp_p3,replace
		
		
*(4)identify what left at line 59, 7 cases when ships has unique imo, but share mmsi or ship_id with other ships
		use temp,clear
		drop if id1==0 & id2==0 &id3==0
	    drop if id2==1 
		*br  //id1=0 id2=0 id3=1 for shipname MSC SHANGHAI got the same ship_id as MC Tokyo. Distinct mmsi and imo number. So must be a mistake
		*-->Assign MSC SHANGHAI a distinct ship_id
		gen ship_id_new=757639 if shipname=="MSC SHANGHAI"
		*br // id1=1 id2=0 id3=0 the rest 6 obs has distinct ship_id and imo number, but share mmsi with some other firms
		*-->Checked at Marinetraffic. Manually adjust
		replace mmsi=525025056 if imo==9118575 
		replace mmsi=439148424 if imo==9128312
		replace mmsi=. if imo== 9180023 //missing 
		replace mmsi=. if imo== 9161766
		
		gen flag_share_mmsi="caution" if id1==1 & id2==0 //flag those, check if use those to merge with clarkson data. Turns out not.
		rename mmsi mmsi_1
        save temp_p4,replace
		
		use temp_p1,clear
		append using temp_p2
		append using temp_p3
		append using temp_p4
		drop id*
		replace ship_id_new=ship_id if ship_id_new==.
		order imo ship_id_new ship_id ship_id_2
		save ID_ship_list_full,replace

		use ID_ship_list_full,clear
		duplicates drop imo ship_id, force  
		distinct   //6165 distinct ship_id_new and imo. Use either ship_id_new, or imo to distinct identify a ship
		
local var "mmsi_1 imo ship_id_new"
local j=0
foreach i of local var{
local j=`j'+1
   duplicates tag `i' if `i'!=., gen(id`j')
}
sum id* 
br if id1==1 // only duplicate case is in mmsi number, where here are two ships without mmsi numbers

		*==> The final data has imo, ship_id_new as unique identifiers.

	 

***********************************************************************************
*Step 2, clean AIS data
 
   *(1)Add clean ship identifier generated from step 1 to the data
   
   *(2)Get clean port-specific vars, save as port_list_full (port id, name, lontitude, latitude, country)
   
   *(3)Get clean ship-specific vars, save as ship_list_full (ship_id_new, imo, ship name, draught_max, dwt)

   *(4)Get clean trips 
      *Sort trips (Departure-Arrival), drop irregular connections
        *sort data to DADA by time stamp
		*drop irregular DAA or DDA
		   *In the case of DAAAAA, keep the first A 
		   *In the case of DDDDA, use the last D
		   
	   *(IMP!)In this step, I kept anchorage, intransit, arrival port=departure port, trips with travel time=zero
	   *  as Inga suggested that she may need those info for later analysis
	  
*****************************************************************************************

use "temp_container.dta", clear
keep if type_name=="Container Ship"|type_name=="Cargo/Containership"
   *manually fill in missing imos & adjust mistakes find  
	replace imo=9299812 if shipname=="CMA CGM NORMA"
	replace imo=9400198 if shipname=="JPO TUCANA"
	replace imo=9484560 if shipname=="CAP JACKSON" & mmsi==563052600
	replace shipname="BEA SCHULTE" if mmsi==477639500  //mmsi and dwt matches a diff ship
	
	*corresp to manuall check in (31) of this do file
    drop if mmsi==413307000 //shipname jinhaizhifeng, Cargo-Hazard A, no imo info in Marinetime website. Was using general cargo STARISTIND's IMO
	drop if mmsi==249000016  //similar prob, shipname Alice, Cargo-Hazard A, no imo
	drop if mmsi==416123456 // a fishing ship without name,  no imo
	drop if mmsi==413793601 //shipname WangQian Sheng 957, Cargo, no imo
    save temp,replace
	
	
**(2)arrange ship_id (use ID_ship_list_full produced in Step 1 (S1) do file)
	use ID_ship_list_full,clear
	rename ship_id ship_id_1  //old ship_id
	*keep imo ship_id_new
	merge 1:m imo using temp
	save temp,replace
	drop mmsi ship_id
	order ship_id* imo mmsi*
	save temp,replace  //checked wheather the assignment of ship_id_new makes sense. See S2-Check1.do


**(3) Get a list of port characteristicss
use temp,clear
keep port* country
duplicates drop port* country, force
save port_list_full, replace
   *check duplicates 
   duplicates drop port_id,force  //0 drops. no duplicates ito port_id
   
   
**(3) Get a list of ship characteristicss
   *for duplicates draught_max, we choose the max
   *for duplicates dtw, we choose the mode (among all trips)
use temp,clear
keep ship* dwt  draught_max imo mmsi_2*
bys imo: egen dwt_adj=mode(dwt)  //the most often showed up, non-zero dwt
collapse dwt_adj  (first) shipname   (max) draught_max, by(ship_id_new imo)
save ship_list_full,replace
   *check duplicates 
duplicates drop ship_id,force  //0 drops. no duplicates ito ship_id. 5175 ships
                                  


** (4) Get a list of trip specific characteristicss
use temp,clear
order ship_id_new port_id port_name move_type inTransit time_stamp port_type
keep ship_id_new port_id port_name move_type inTransit time_stamp port_type new_draught // 68 case in each trip, have two flags. Drop flag info for now to avoid duplicates
duplicates drop 
save trip_list_full,replace
        *check duplicates 
        duplicates drop ship_id port_id port_name move_type inTransit time_stamp port_type new_draught,force  //0 drops. no duplicates ito port_id


*  (5) Start to clean irregular trips		
	/* we observe two stamps for the same ship very often in cases where ships move in or out of anchorage
	likely because they lie directly next to each other

	the following procedure sort these observations so that always the one for which the arrival (dep) port aligns
	with the previous regular dep (arrival) port COMES FIRST. sometimes two duplicate cases follow each other, hence
	this needs to be done in a loop over incidences of duplicate time stamps starting with the first for each ship.
	After a given incidence has been resolved, the data is re-sorted...which takes time... 
	*/
	
	*(5.1) sort trips as ADAD...AD based on timestamps
	set more off
	use trip_list_full,clear
	sort ship_id time_stamp move_type
	duplicates tag ship_id time_stamp, g(tag)
	bysort ship_id: g cumtag = sum(tag) if tag==1
	bysort ship_id: replace cumtag = cumtag-1 if cumtag!=. & time_stamp==time_stamp[_n-1]

	summ cumtag
	global M = r(max)
	di "$M"

	sort ship_id time_stamp move_type
	bysort ship_id cumtag: g n=_n if cumtag!=.

	g sort = 2 if cumtag!=.
	sort ship_id time_stamp  n

	forvalues i=1(2)$M {
	bysort ship_id: replace sort = 1 if cumtag==`i' & port_id==port_id[_n-1] & move_type!=move_type[_n-1] & n==1
	bysort ship_id: replace sort = 1 if cumtag==`i' & n==2 & sort[_n-1]==2
	sort ship_id time_stamp sort n
	di "`i'"
	}
	save ais_container_sorted_midstep,replace 

     *(5.2) Arrange irregular combinations. DAA, or DDA
	  *In the case of DAAAAA, keep the first A 
      *In the case of DDDDA, use the last D	   
	  *g trip indicator
	g trip = 1 if move == "departure"
	sort ship_id time_stamp sort

	bysort ship_id: replace trip = sum(trip)
	bysort ship_id trip: g N = _N
	summ N
	count if N==3
	* there are 156 incidents where arrival is followed by arrival at the same port
	* I drop the later one
	sort ship_id time_stamp sort n
	bysort ship_id trip: g nn=_n
	drop if N==3 & nn==3 & move_type=="arrival"
	drop if N==1 /*incomplete trips (in the sense that the they start with arrival instead of departure*/
	drop if trip  == 0  /*there is one ship which starts with a duplicate arrival*/
	drop nn N tag cumtag n

** add one second to time stamp for tied observations that come second
g seconds = substr(time_stamp,-7,2)
destring seconds, replace
replace seconds = seconds+1 if sort==2
tostring seconds, replace
replace seconds = "0" + seconds if length(seconds)==1
g time = substr(time_stamp,1,17)+ seconds
drop seconds sort 
rename time_stamp time_stamp_original
save ais_container_sorted_yz, replace



***********************************************************************************
*Step 3, Merge with Clarkson Data  
*****************************************************************************************

*(1)Get and Clean Clarkson Data
use dwt_clarkson.dta,clear
keep dwt name owner imonumber  mmsimaritimemobileserviceidentit
rename imo imo
rename mmsi mmsi
rename name shipname
duplicates drop shipname imo mmsi,force   //checked , 7 ships without imo, mmsi, name didnt match any in AIS
destring mmsi ,replace
destring imo,replace
save tempdata,replace
	
clear all	
insheet using Clarkson0.csv,name
drop v7
rename imo imo
rename mmsi mmsi
rename name shipname  
drop ballast
duplicates drop  //3,386 droped
*manually adj
drop if draughtm=="Draught (m)"
destring imo, gen(imo_1) force  //2295 out of 88,687+2295 ships miss imo numbers; these are total ships
br if imo!="" & imo_1==.  //manually adj misrecording which could be corrected
replace imo="7217133.00" if shipname=="Vincent D. Tibbetts"
replace mmsi="367513050.00" if shipname=="Vincent D. Tibbetts"
destring imo,force replace
drop imo_1
*destring vars
local var "mmsi teu draughtm"
foreach i of local var{
destring `i',gen(`i'_1) force
sum `i'_1 if `i'!="" & `i'_1==. //check, all sum should be zero. i.e, no missrecording numbers to some strings
destring `i',replace force
drop `i'_1
}
duplicates drop shipname imo mmsi,force //checked , 6 ships without imo, mmsi, name didnt match any in AIS
save Clarkson0,replace


*merge dtw
use Clarkson0,clear
merge 1:1  shipname imo mmsi using tempdata
save temp,replace

use temp,clear
keep if _merge==1
drop dwt
drop _merge
save temp1,replace

use temp,clear
keep if _merge==2 //dwt
drop teu draughtm
drop _merge
save temp2,replace

use temp1,clear
keep if imo!=.
merge 1:m imo using temp2
save temp1,replace

use temp1,clear
keep if _merge==1
drop dwt
drop _merge
save temp2,replace

use temp1,clear
keep if _merge==2 //dwt
drop teu draughtm
drop _merge
save temp3,replace

use temp2,clear
drop if mmsi==.
merge 1:m mmsi using temp3
save temp1,replace

use temp,clear
keep if _merge==3
append using temp1
drop _merge
replace dwt=. if dwt==0 //check, passed
rename dwt dwt_clarkson
save Clarkson0,replace

   *check duplicates of imo and mmsi
   use Clarkson0,clear
   duplicates tag imo if imo!=., gen(id)
   duplicates tag mmsi if mmsi!=., gen(id2)
   
   sum id*   
    //-->imo unique identifier, mmsi has 1 duplciates. 
			 /* br if id2==1
			   shipname	imo	mmsi	teu	draughtm	id	id2
			Hedy	9212888	3.578e+08		22.22	0	1
			Lirio	9743083	3.578e+08		4.56	0	1
			 */
   //-> checked, this is a oil crude tanker, so drop 
	drop if mmsi==357840000
	drop id*
    replace shipname = upper(shipname)
	save Clarkson0,replace 


*(2) Merge Clarkson data to AIS ship ID list
    *rules: first match by IMO, if unmatched, try MMSI_1, if not, try MMSI_2, if still not, try match by shipname

use Clarkson0,clear
keep imo teu draughtm dwt
drop if imo==.
merge 1:1 imo using ID_ship_list_full
save data,replace

use data,clear
keep if _merge==3 //4,782 matched. 383 unmatched
drop _merge
save temp,replace

use data,clear
keep if _merge==2 //in AIS, not in clarkson
drop _merge
rename mmsi_1 mmsi //no missings
drop if mmsi==.
merge 1:m mmsi using Clarkson0  //1:m only due to missings. not match missings
keep if _merge==1|_merge==3 //matched 18
rename _merge a_merge
rename mmsi mmsi_1
rename mmsi_2 mmsi
save temp1,replace

drop if mmsi==.
merge 1:m mmsi using Clarkson0   //matched 0

use temp1,clear
sum a_merge if a_merge==1  //364 imo in AIS remain unmateched 
merge m:m shipname using Clarkson0 //names matches 149
br if _merge==3
sum teu dra  if _merge==3  //those matched all have missing teu and draught_m at the clarkson dataset, therefore

*--> just use the matched by imo and mmsi_1

*Append data matched by imo and mmsi_1. But before that, check if any of those matched by mmsi has flags
use temp1,clear
keep if a_merge==3
*br if flag!=""  //checked, no ships with flagged mmsi number got matched by mmsi number
drop a_merge
append using temp
duplicates drop imo,force //checked, zero drops

keep imo teu draughtm ship_id_new dwt_clarkson
save temp,replace

*(3)Merge to AIS variables (ship_list_full)
use ship_list_full,clear
merge 1:1 imo ship_id using temp
drop _merge
save ship_all_variables,replace

*(4)Manually adjust dwt missings
use ship_all_variables,clear
replace dwt_adj=dwt_clarkson if dwt_clarkson!=.
br if dwt_adj==.
replace dwt_adj=93025 if imo==9430387
drop dwt_c
rename dwt dwt
save ship_all_variables,replace

save "$dir\Repository\data_intermediate\ship_all_variables",replace 


***************************************************************************************
**clear intermediate data files 
!del *tmp*.dta
!del *temp*.dta


